# Creates a list containing testing and training dataframes
#
# Randomly partitions `data` into a training set of floor(trainPercent * n)
# rows and a testing set of the remaining rows. The split is reproducible
# via `seed`.
#
# @param data         A data.frame to split.
# @param seed         RNG seed for reproducibility (default 123).
# @param trainPercent Proportion of rows assigned to training, in (0, 1].
# @return A list with data.frame elements `training` and `testing`.
createTraining <- function(data, seed = 123, trainPercent = 0.8) {
  stopifnot(is.data.frame(data), trainPercent > 0, trainPercent <= 1)
  set.seed(seed)
  n <- nrow(data)

  numTrain <- floor(trainPercent * n)
  # seq_len(n) is safe when n == 0, unlike 1:n (which yields c(1, 0))
  trainingRows <- sample(seq_len(n), size = numTrain, replace = FALSE)

  trainingData <- data[trainingRows, , drop = FALSE]
  # Guard the empty-training edge case: data[-integer(0), ] would return
  # zero rows instead of all rows
  if (numTrain > 0) {
    testingData <- data[-trainingRows, , drop = FALSE]
  } else {
    testingData <- data
  }

  list(training = trainingData, testing = testingData)
}

# Creates confidence and prediction intervals
#
# Augments `data` with the model's fitted value (`fit`), confidence-interval
# bounds (`confLwr`/`confUpr`) and prediction-interval bounds
# (`predictLwr`/`predictUpr`) for each row.
jjIntervals <- function(data, model) {
  conf <- as.data.frame(predict.lm(model, newdata = data, interval = "confidence"))
  names(conf)[names(conf) == "lwr"] <- "confLwr"
  names(conf)[names(conf) == "upr"] <- "confUpr"

  pred <- as.data.frame(predict.lm(model, newdata = data, interval = "prediction"))
  names(pred)[names(pred) == "lwr"] <- "predictLwr"
  names(pred)[names(pred) == "upr"] <- "predictUpr"
  # Keep only the bounds; the fit column already comes from `conf`
  pred <- pred[, c("predictLwr", "predictUpr")]

  cbind(data, conf, pred)
}

# Creates a density plot given parameters
#
# Density of {{x}} filled by {{fill}}, with a rug along the x-axis coloured
# by {{color}}. Arguments are tidy-eval column references; the legend is
# suppressed so composed figures can collect guides themselves.
jjplotDensity <- function(data, x, fill, color) {
  ggplot(data, aes(x = {{ x }})) +
    geom_density(aes(fill = {{ fill }}), alpha = 0.4) +
    geom_rug(aes(color = {{ color }}), y = 0) +
    theme_custom() +
    theme(legend.position = "none")
}

# Creates a boxplot given parameters
#
# Horizontal boxplot (coord_flip) of {{y}} by {{x}}, filled by {{fill}}.
# Tidy-eval column references; legend suppressed.
jjplotBoxplot <- function(data, x, y, fill) {
  base <- ggplot(data, aes(x = {{ x }}, y = {{ y }}, fill = {{ fill }}))
  base +
    geom_boxplot() +
    coord_flip() +
    theme_custom() +
    theme(legend.position = "none")
}

# Creates a scatter plot
#
# Scatter of {{x}} vs {{y}} coloured by {{color}}, overlaid with the fitted
# line (blue), confidence band (yellow ribbon / black dashed) and prediction
# band (red dashed), all taken from jjIntervals().
# NOTE(review): the 10^ back-transforms assume the model was fit on a
# log10-scale response -- confirm against the model passed in.
jjplotPoint <- function(data,x,y,color, model) {
  # Augment data with fit/confLwr/confUpr/predictLwr/predictUpr columns
  data <- jjIntervals(data,model)
  plot <- ggplot(data=data, aes(x = {{x}}, y = {{y}}, color = {{color}})) +
    geom_point() +
    geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
    geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
    geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
    geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
    geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
    geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75) +
    theme_custom()
  return(plot)
}

# creates an exponential scatter plot
#
# Same overlay as jjplotPoint() but with jittered points and no
# theme_custom(); intended for the back-transformed (exponential) view of a
# log10-scale model.
# NOTE(review): the 10^ back-transforms assume a log10-scale response --
# confirm against the model passed in.
jjplotLogPoint <- function(data, x, y, color, model) {
  # Augment data with fit/confLwr/confUpr/predictLwr/predictUpr columns
  data <- jjIntervals(data, model)
  plot <- ggplot(data = data, aes(x = {{ x }}, y = {{ y }}, color = {{ color }})) +
    geom_jitter() +
    geom_ribbon(aes(ymin = 10^confLwr, ymax = 10^confUpr), fill = "yellow", alpha = 0.4) +
    geom_line(aes(y = 10^fit), color = "#3366FF", size = 0.75) +
    geom_line(aes(y = 10^confLwr), linetype = "dashed", size = 0.75) +
    geom_line(aes(y = 10^confUpr), linetype = "dashed", size = 0.75) +
    geom_line(aes(y = 10^predictLwr), linetype = "dashed", color = "red", size = 0.75) +
    geom_line(aes(y = 10^predictUpr), linetype = "dashed", color = "red", size = 0.75)
  return(plot)
}

# Checks a model and gives back error and p-value
#
# Given the data frame the confusion matrix was built from and a 2x2
# confusion matrix (rows = actual, cols = predicted), returns the
# misclassification rate and a normal-approximation p-value comparing that
# rate to the observed positive proportion.
checkModel <- function(data, matrix) {
  sampleSize <- nrow(data)

  # Off-diagonal cells are the misclassified counts
  misclassified <- matrix[1, 2] + matrix[2, 1]
  error <- misclassified / sampleSize

  # Observed proportion of actual positives (second row of the matrix)
  pHat <- (matrix[2, 1] + matrix[2, 2]) / sampleSize
  standardError <- sqrt(pHat * (1 - pHat) / sampleSize)

  pValue <- pnorm(error, mean = pHat, sd = standardError)

  list(error = error, pValue = pValue)
}

# Gets a prediction from the model
#
# Appends the model's response-scale (probability) predictions to `data`
# as a new `recidPredict` column and returns the widened data frame.
getPredict <- function(data, model) {
  recidPredict <- predict.glm(model, newdata = data, type = "response")
  cbind(data, recidPredict)
}

# RMSE calculation given two lists of numbers
#
# Root-mean-square error between predictions and observations; pairs with
# missing values are dropped before averaging.
RMSE <- function(predict, obs) {
  squaredErrors <- (predict - obs)^2
  sqrt(mean(squaredErrors, na.rm = TRUE))
}

# Jaden-Jake Intervals, makes confidence and prediction intervals for a model on data
# NOTE(review): this re-defines the jjIntervals() already declared earlier in
# this file with an identical body; the two copies should be consolidated.
jjIntervals <- function(data, model) {
  # Confidence interval for the mean response, renamed to avoid name clashes
  confidence <- as.data.frame(predict.lm(model, newdata = data, interval = "confidence")) %>%
    rename(confLwr = lwr, confUpr = upr)
  
  # Prediction interval for individual observations; drop its duplicate fit column
  prediction <- as.data.frame(predict.lm(model, newdata = data, interval = "prediction")) %>%
    rename(predictLwr = lwr, predictUpr = upr) %>%
    select(predictLwr, predictUpr)
  
  intervalData <- cbind(data, confidence, prediction)
  
  return(intervalData)
}
# full data set (4000-row sample used for the main models)
recid <- read.csv("datasets/Project3Sample4000.csv")

# mystery project data (100 rows; used for blind predictions)
recidMystery <- read.csv("datasets/Project3Mystery100.csv")

# male female split (1500 rows each; used for the fairness comparison)
recidMale <- read.csv("datasets/Project3Males1500.csv")
recidFemale <- read.csv("datasets/Project3Females1500.csv")

Introduction to the Project

Recidivism is a term used within the criminal justice system which means “the tendency of a criminal to reoffend after serving a sentence in a disciplinary institution.” The data we will be analyzing is from Broward County, Florida and includes recidivism predictions from two proprietary tests that are given to inmates. These predictions are in the form of decile scores attributed to an inmate's likelihood of recidivating and level of violence. The goal of this analysis is, based on various characteristics of an inmate, to predict whether or not they will reoffend within two years of being released, as well as to support or debunk the proprietary tests used to make this classification.

By the end of this analysis, we hope to have an accurate classification model for whether or not a person is likely to reoffend as well as have the ability to discuss the accuracy of the model in detail.

Finally, we hope to understand the ethical implications of the model we make and to know how to mitigate and/or measure the biases held by the model itself.


Task 1: Data Cleaning

This task revolves around visualizing the data and making the data we are given usable. We clean the full data set found in the Project3Sample4000.csv file. This includes data cleaning, feature engineering, and data refining along with the creation of a testing training split.

## Data Cleaning
# Rename the raw columns to camelCase, coerce dates and factors, and keep
# only rows with a known outcome and at least one jail date.
recid2 <- recid %>%
  rename(
    dayBefScreenArrest = days_b_screening_arrest,
    jailIn = c_jail_in,
    jailOut = c_jail_out,
    daysFromCompas = c_days_from_compas,
    chargeDegree = c_charge_degree,
    chargeDesc = c_charge_desc,
    riskRecidDecileScore = RiskRecidDecileScore,
    riskRecidScoreLevel = RiskRecidScoreLevel,
    riskRecidScreeningDate = RiskRecidScreeningDate,
    riskViolenceDecileScore = RiskViolenceDecileScore,
    riskViolenceScoreLevel = RiskViolenceScoreLevel
  ) %>%
  mutate(
    # dob is parsed here but dropped below; age/ageCat carry that information
    dob = as_date(dmy(dob)),
    ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
    race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
    jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
    jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
    # Strip parentheses from the charge-degree labels before factoring
    chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
    riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
    riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate)),
    # Factor version of the outcome for plotting
    recidCat = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  ) %>%
  select(-name, -dob) %>%
  filter(!is.na(isRecid) & (!is.na(jailIn) | !is.na(jailOut)))

## Data Engineering

# Feature engineering: jail duration, log10 transforms (+0.1 offsets avoid
# log10(0)), charge-description length, and combined juvenile counts.
recid3 <- recid2 %>%
  mutate(
    # +1 so a same-day stay counts as one day in jail
    daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
    logDaysInJail = log10(daysInJail),
    logPriorsCount = log10(priorsCount + 0.1),
    # NOTE(review): str_count() with no pattern -- presumably counts the
    # characters in the charge description; confirm intended behaviour
    chargeDescCount = str_count(chargeDesc),
    logChargeDescCount = log10(chargeDescCount+0.1),
    juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
    logJuvCount = log10(juvCount + 0.1)
  )

# Category Removal
#
# Drop race before the logistic-model split; the ethical rationale is
# discussed in the surrounding text.
recid4 <- recid3 %>% select(-race)

## Testing Training Split for Logistic Models

# Seeded split so the logistic-model results are reproducible
testingTraining <- createTraining(recid4, seed=8675309)

recidTraining <- testingTraining$training

recidTesting <-  testingTraining$testing

## Testing Training Split for Multiple Regression Models

# Separate split (different seed) for the regression models; recid3 still
# contains the race column that recid4 drops
testingTraining2 <- createTraining(recid3, seed = 859)

recidTraining2 <- testingTraining2$training

recidTesting2 <- testingTraining2$testing



# Mystery Data Matching
#
# Apply the same renaming/cleaning used on the main sample to the mystery
# data. Fewer renames apply because the mystery file lacks the outcome and
# proprietary score columns. Trailing commas inside rename()/mutate() have
# been removed (they error on older dplyr versions).
recidMystery2 <- recidMystery %>%
  rename(
    dayBefScreenArrest = days_b_screening_arrest,
    jailIn = c_jail_in,
    jailOut = c_jail_out,
    daysFromCompas = c_days_from_compas,
    chargeDegree = c_charge_degree,
    chargeDesc = c_charge_desc,
    riskRecidScreeningDate = RiskRecidScreeningDate
  ) %>%
  mutate(
    # dob is parsed here but dropped below; age/ageCat carry that information
    dob = as_date(dmy(dob)),
    ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
    race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
    jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
    jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
    chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
    riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate))
  ) %>%
  select(-dob) %>%
  filter((!is.na(jailIn) | !is.na(jailOut)))

recidMystery3 <- recidMystery2 %>%
  mutate(
    daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
    logDaysInJail = log10(daysInJail),
    logPriorsCount = log10(priorsCount + 0.1),
    juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
    logJuvCount = log10(juvCount + 0.1)
  )


# Male Vs. Female Data and Cleaning
# Mirrors the recid2 cleaning pipeline for the male subset.
# NOTE(review): this pipeline is duplicated three times in the file; it
# should ideally be factored into a shared cleaning function.
recidMale2 <- recidMale %>%
  rename(
    dayBefScreenArrest = days_b_screening_arrest,
    jailIn = c_jail_in,
    jailOut = c_jail_out,
    daysFromCompas = c_days_from_compas,
    chargeDegree = c_charge_degree,
    chargeDesc = c_charge_desc,
    riskRecidDecileScore = RiskRecidDecileScore,
    riskRecidScoreLevel = RiskRecidScoreLevel,
    riskRecidScreeningDate = RiskRecidScreeningDate,
    riskViolenceDecileScore = RiskViolenceDecileScore,
    riskViolenceScoreLevel = RiskViolenceScoreLevel
  ) %>%
  mutate(
    dob = as_date(dmy(dob)),
    ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
    race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
    jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
    jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
    chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
    riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
    riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate)),
    recidCat = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  ) %>%
  select(-name, -dob) %>%
  filter(!is.na(isRecid) & (!is.na(jailIn) | !is.na(jailOut)))

# Mirrors the recid2 cleaning pipeline for the female subset (see the male
# pipeline above for the same steps).
recidFemale2 <- recidFemale %>%
  rename(
    dayBefScreenArrest = days_b_screening_arrest,
    jailIn = c_jail_in,
    jailOut = c_jail_out,
    daysFromCompas = c_days_from_compas,
    chargeDegree = c_charge_degree,
    chargeDesc = c_charge_desc,
    riskRecidDecileScore = RiskRecidDecileScore,
    riskRecidScoreLevel = RiskRecidScoreLevel,
    riskRecidScreeningDate = RiskRecidScreeningDate,
    riskViolenceDecileScore = RiskViolenceDecileScore,
    riskViolenceScoreLevel = RiskViolenceScoreLevel
  ) %>%
  mutate(
    dob = as_date(dmy(dob)),
    ageCat = factor(as.factor(ageCat), levels = c("Less than 25", "25 - 45", "Greater than 45")),
    race = factor(as.factor(race), levels = c("white", "black", "hispanic", "other")),
    jailIn = as.Date(dmy_hm(jailIn, tz = "EST")),
    jailOut = as.Date(dmy_hm(jailOut, tz = "EST")),
    chargeDegree = as.factor(gsub("[()]", "", chargeDegree)),
    riskRecidScoreLevel = as.factor(riskRecidScoreLevel),
    riskRecidScreeningDate = as_date(dmy(riskRecidScreeningDate)),
    recidCat = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  ) %>%
  select(-name, -dob) %>%
  filter(!is.na(isRecid) & (!is.na(jailIn) | !is.na(jailOut)))

# Engineer the same model features as recid3 (minus the charge-description
# counts) for the male subset
recidMale3 <- recidMale2 %>%
  mutate(
    daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
    logDaysInJail = log10(daysInJail),
    logPriorsCount = log10(priorsCount + 0.1),
    juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
    logJuvCount = log10(juvCount + 0.1)
  )

# Same feature engineering for the female subset
recidFemale3 <- recidFemale2 %>%
  mutate(
    daysInJail = as.numeric(difftime(jailOut, jailIn, unit = "days") + 1),
    logDaysInJail = log10(daysInJail),
    logPriorsCount = log10(priorsCount + 0.1),
    juvCount = juvFelonyCount + juvMisdemeanerCount + juvOtherCount,
    logJuvCount = log10(juvCount + 0.1)
  )

# Seeded testing/training splits for the male and female subsets
maleTestingTraining <- createTraining(recidMale3, seed = 123)
maleTraining <- maleTestingTraining$training
maleTesting <- maleTestingTraining$testing

femaleTestingTraining <- createTraining(recidFemale3, seed = 123)
femaleTraining <- femaleTestingTraining$training
femaleTesting <- femaleTestingTraining$testing

# Data for Task 4

# Independent split (seed 123) of the race-free data for the Task 4 models
jailTimeTestingTraining <-  createTraining(recid4, seed=123)

jailTimeTraining <- jailTimeTestingTraining$training
jailTimeTesting <- jailTimeTestingTraining$testing

In our data cleaning, we forced date variables to be stored as dates and factor variables to be factors. Then we engineered a few categories in the data - some for convenience and some for purpose. These include the following: daysInJail (difference between entry and exit of jail), logDaysInJail (log base 10 of daysInJail), logPriorsCount (log base 10 of priorsCount), juvCount (total of all juvenile crime categories), and logJuvCount (log base 10 of juvCount). We then remove name, dob, and race because name is irrelevant to recidivism, dob is covered by the included age category, and race is not fair to include in a predictive model as there is no definitive difference aside from visually between two people of different races.

Task 2: Statistical Modeling!

In this section we will be creating four different models based on the data set given and now refined. Our first two models will be logistic regression models to predict whether or not an inmate will reoffend within two years of their release, both including and excluding the proprietary test scores included in the data. Our third model will predict the risk of recidivism score (riskRecidDecileScore). This score was given by a written test given to prisoners and was computed by a third-party company that uses a black-box algorithm. Our fourth model will predict the violence score of an inmate (riskViolenceDecileScore), once more calculated from a test administered to inmates and computed by a third-party company using a black-box algorithm.

Logistic Regression 1: Predicting Recidivism without Black-Box Predictors in Data

This is our first model. It is a logistic regression (classification) model which predicts whether or not an inmate will reoffend within two years of being released. The following plots are some data visualization relevant to this model.

Relevant Data Visualizations

Figure 1

### DaysInJail Plot

# Figure 1: p1/p2 are densities of raw vs log10 days in jail; p3/p4 are the
# matching boxplots. Raw vs logged versions justify the log10 transform.
p1 <- recidTraining %>% 
  jjplotDensity(x = daysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title="Days in Jail",
    x = "Days in Jail"
  )

p2 <- recidTraining %>% 
  jjplotDensity(x = logDaysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title="Log10 of Days in Jail",
    x = "log10(daysInJail)"
  )

p3 <- recidTraining %>% 
  jjplotBoxplot(x = isRecid, y=daysInJail, fill=as.factor(isRecid)) +
  labs(
    title="Days in Jail",
    y = "Days in Jail",
    x = "Reoffence Prediction Proportion"
  )

p4 <- recidTraining %>% 
  jjplotBoxplot(x = isRecid, y=logDaysInJail, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
  labs(
    title="Log Base 10 of Days in Jail",
    y = "log10(daysInJail)",
    x = "Reoffence Prediction Proportion",
    fill = "Reoffence Prediction Proportion"
  )

# Compose the four panels (patchwork) with one collected legend
p1 + p2 + p3 + p4 + 
  plot_annotation(
    title = "Days in Jail and log10(Days in Jail)",
    theme=theme_custom()
  ) + plot_layout(guides = 'collect')

Figure 1 illustrates that a higher proportion of the inmates who spent less time in jail went on to reoffend compared to those who spent a longer duration in jail. This is shown more clearly in the right two plots, as the left two plots are so heavily skewed right that they are not very readable. The left two plots are included to demonstrate that taking the log base 10 of daysInJail eliminates much of the rightward skew, making it a more sensitive predictor to include in a model than daysInJail itself.

Figure 2

### Priors Count

# Figure 2: densities and boxplots of raw vs log10(priorsCount + 0.1),
# split by recidivism outcome
p5 <- recidTraining %>% 
  jjplotDensity(x = priorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title="Priors Counts",
    x = "Priors Counts"
  )
p6 <- recidTraining %>% 
  jjplotDensity(x = logPriorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title="log10(Priors Counts + 0.1)",
    x = "log10(Priors Counts + 0.1)"
  )
p7 <- recidTraining %>% 
  jjplotBoxplot(x = isRecid, y=priorsCount, fill=as.factor(isRecid)) +
  labs(
    title="Priors Counts",
    y = "Priors Counts",
    x = "Recidivated",
    fill = "Recidivated"
  )
p8 <- recidTraining %>% 
  jjplotBoxplot(x = isRecid, y=logPriorsCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title="log10(Priors Counts + 0.1)",
    y = "log10(Priors Counts + 0.1)",
    x = "Recidivated",
    fill = "Recidivated"
  )
# Compose the four panels with one collected legend
p5 + p6 + p7 + p8 + plot_annotation(title = "Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')

Figure 2 illustrates that taking the log base 10 of the number of prior offenses (plus 0.1 to avoid taking the log10 of 0) improves the sensitivity of the predictions made with that variable as well as reducing the number of outliers included in the data which means the model will better predict whether or not an inmate will reoffend after being released. For this reason, we will be using the log base 10 of priors count as opposed to just priorsCount.

Figure 3

### Juvenile Priors Count

# Figure 3: densities and boxplots of raw vs log10(juvCount + 0.1); used to
# justify dropping the juvenile-count variables from the models
p9 <- recidTraining %>% 
  jjplotDensity(x = juvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title="Juvenile Priors Counts",
    x = "Juvenile Priors Counts"
  )
p10 <- recidTraining %>% 
  jjplotDensity(x = logJuvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title="log10(Juvenile Priors Counts + 0.1)",
    x = "log10(Juvenile Priors Counts + 0.1)"
  )
p11 <- recidTraining %>% 
  jjplotBoxplot(x = isRecid, y=juvCount, fill=as.factor(isRecid)) +
  labs(
    title="Juvenile Priors Counts",
    y = "Juvenile Priors Counts",
    x = "Recidivated",
    fill = "Recidivated"
  )
p12 <- recidTraining %>% 
  jjplotBoxplot(x = isRecid, y=logJuvCount, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title="log10(Juvenile Priors Counts + 0.1)",
    y = "log10(Juvenile Priors Counts + 0.1)",
    x = "Recidivated",
    fill = "Recidivated"
  )
# Compose the four panels with one collected legend
p9 + p10 + p11 + p12 + plot_annotation(title = "Juvenile Priors Counts", theme=theme_custom()) + plot_layout(guides = 'collect')

Figure 3 shows that we can rule out prior crimes committed in juvenile years. There was not enough relevant data in these plots to include the variable or the log base 10 of the variable; therefore, this plot is important to include as a justification of refining our model later on. This is one of the few instances where we could simply rule out a variable this early in the process.

Figure 4

### Age

# Figure 4: age density and boxplot by recidivism outcome
p13 <- recidTraining %>% 
  jjplotDensity(x = age, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title="Age",
    x = "Age"
  )

p14 <- recidTraining %>% 
  jjplotBoxplot(x = isRecid, y=age, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title="Age",
    y = "Age",
    x = "Recidivated",
    fill = "Recidivated"
  )

# Stack the two panels vertically (patchwork)
p13 / p14 + plot_annotation(theme=theme_custom())

Figure 4 shows that age is a factor in whether or not a person will reoffend. Shown by the density and boxplots, we can determine that in this data, there is a higher proportion of reoffenders in younger populations and a higher proportion of non-reoffenders in older populations. The age cutoff in the trend is 35 years old. This informs us that we could potentially use age as a predictor in the model to predict recidivism.

Figure 5

### Sex

ggplot(data=recidTraining,aes(x=sex, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
  geom_bar(position = "dodge") +
  labs(
    title="Sex",
    x = "Sex",
    fill = "Recidivated"
  ) +
  theme_custom()

Figure 5 takes into account sex as a factor determining reoffending rates. Comparing reoffending within each sex, men reoffend at a higher rate relative to the male population than women do relative to the female population. This means that this variable would most likely be useful to the model; however, sex, along with race, will not be included for ethical reasons, as it is not ethical or fair to judge someone more harshly based on their sex or race.

Figure 6

### ChargeDegree

ggplot(data=recidTraining,aes(x=chargeDegree, fill=fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
  geom_bar(position = "dodge") +
  labs(
    title="Charge Degree",
    x = "Charge Degree",
    fill = "Recidivated"
  ) +
  theme_custom()

Figure 6 illustrates the difference in recidivism rate based on charge degree. Despite these being ordered properly from most severe crime to least severe crime, there does not seem to be a consistent pattern by eye. This means we can leave this variable in the initial model but it may be refined out later.

Figure 7

### Colinearity Check

# Figure 7: pairwise scatter plots of the three candidate predictors to
# check visually for collinearity before modeling
p15 <- ggplot(recidTraining, aes(x = logDaysInJail, y = logPriorsCount, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
  geom_point() +
  labs(
    title="log10(Days In Jail) vs log10(Priors Count)",
    x = "log10(Days In Jail)",
    y = "log10(Priors Count)",
    color = "Recidivated"
  ) +
  theme_custom()

p16 <- ggplot(recidTraining, aes(x = logDaysInJail, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
  geom_point() +
  labs(
    title="log10(Days In Jail) vs Age",
    x = "log10(Days In Jail)",
    y = "Age",
    color = "Recidivated"
  ) +
  theme_custom()

p17 <- ggplot(recidTraining, aes(x = logPriorsCount, y = age, color = fct_recode(as.factor(isRecid),Yes = "1", No = "0"))) +
  geom_point() +
  labs(
    title="log10(Priors Count) vs Age",
    x = "log10(Priors Count)",
    y = "Age",
    color = "Recidivated"
  ) +
  theme_custom()

# One panel on top, two below, with a single collected legend
p15 / (p16 + p17) + plot_annotation(title = "Colinearity Check", theme=theme_custom()) + plot_layout(guides = 'collect')

Figure 7 shows us that none of the three variables (age, logDaysInJail, and logPriorsCount) are colinear. The three subplots in conjunction show us that each of these three variables is not colinear with any of the others, and, therefore, all of them may be included in the model to potentially be refined out later.

The Model

## Everything Model

# Starting-point logistic model that includes both the raw and log10 forms
# of the candidate predictors; later models prune from here.
everythingModel <- glm(isRecid ~  age + priorsCount + daysInJail + logPriorsCount + logDaysInJail,data=recidTraining, family = binomial)

# Classify at a 0.5 predicted-probability threshold, then cross-tabulate
# actual (rows) vs predicted (cols) for train and test sets
everythingPredictTrain <- getPredict(recidTraining, everythingModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
everythingMatrixTrain <- table(everythingPredictTrain$isRecid,everythingPredictTrain$prediction)

everythingPredictTest <- getPredict(recidTesting, everythingModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
everythingMatrixTest <- table(everythingPredictTest$isRecid,everythingPredictTest$prediction)

#checkModel(recidTraining,everythingMatrixTrain)

#checkModel(recidTesting,everythingMatrixTest)

## Basic Model

# Raw (untransformed) predictors plus sex, for comparison with the logged models
baseModel <- glm(isRecid ~ sex + age + priorsCount + daysInJail,data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
basePredictTrain <- getPredict(recidTraining, baseModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
baseMatrixTrain <- table(basePredictTrain$isRecid,basePredictTrain$prediction)

basePredictTest <- getPredict(recidTesting, baseModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
baseMatrixTest <- table(basePredictTest$isRecid,basePredictTest$prediction)

#checkModel(recidTraining,baseMatrixTrain)

#checkModel(recidTesting,baseMatrixTest)

## Log Model

# Log10-transformed predictors plus sex
logModel <- glm(isRecid ~ sex + age + logPriorsCount + logDaysInJail, data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
logPredictTrain <- getPredict(recidTraining, logModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
logMatrixTrain <- table(logPredictTrain$isRecid,logPredictTrain$prediction)

logPredictTest <- getPredict(recidTesting, logModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
logMatrixTest <- table(logPredictTest$isRecid,logPredictTest$prediction)

#checkModel(recidTraining,logMatrixTrain)

#checkModel(recidTesting,logMatrixTest)

## Recid Model

# Mixed form: logged priors count, raw days in jail
recidModel <- glm(isRecid ~ sex + age + logPriorsCount + daysInJail, data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
recidPredictTrain <- getPredict(recidTraining, recidModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recidMatrixTrain <- table(recidPredictTrain$isRecid,recidPredictTrain$prediction)

recidPredictTest <- getPredict(recidTesting, recidModel) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recidMatrixTest <- table(recidPredictTest$isRecid,recidPredictTest$prediction)

#checkModel(recidTraining,recidMatrixTrain)

#checkModel(recidTesting,recidMatrixTest)

## Recid 2 Model

# As the Recid model but without sex (dropped for ethical reasons)
recid2Model <- glm(isRecid ~ age + logPriorsCount + daysInJail, data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
recid2PredictTrain <- getPredict(recidTraining, recid2Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid2MatrixTrain <- table(recid2PredictTrain$isRecid,recid2PredictTrain$prediction)

recid2PredictTest <- getPredict(recidTesting, recid2Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid2MatrixTest <- table(recid2PredictTest$isRecid,recid2PredictTest$prediction)

#checkModel(recidTraining,recid2MatrixTrain)

#checkModel(recidTesting,recid2MatrixTest)

## Recid 3 Model

# Minimal model: age and logged priors count only
recid3Model <- glm(isRecid ~ age + logPriorsCount, data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
recid3PredictTrain <- getPredict(recidTraining, recid3Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid3MatrixTrain <- table(recid3PredictTrain$isRecid,recid3PredictTrain$prediction)

recid3PredictTest <- getPredict(recidTesting, recid3Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid3MatrixTest <- table(recid3PredictTest$isRecid,recid3PredictTest$prediction)

#checkModel(recidTraining,recid3MatrixTrain)

#checkModel(recidTesting,recid3MatrixTest)

## Recid 4 Model

# Minimal model with sex added back, for comparison
recid4Model <- glm(isRecid ~ age + sex + logPriorsCount, data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
recid4PredictTrain <- getPredict(recidTraining, recid4Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid4MatrixTrain <- table(recid4PredictTrain$isRecid,recid4PredictTrain$prediction)

recid4PredictTest <- getPredict(recidTesting, recid4Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid4MatrixTest <- table(recid4PredictTest$isRecid,recid4PredictTest$prediction)

#checkModel(recidTraining,recid4MatrixTrain)

#checkModel(recidTesting,recid4MatrixTest)

# NOTE(review): this re-reads the mystery CSV already loaded above as
# recidMystery and appears unused in this chunk -- confirm it is needed
recidMysteryBox <- read.csv("datasets/Project3Mystery100.csv")

## Recid 5 Model

# Minimal model with the raw (unlogged) priors count, for comparison
recid5Model <- glm(isRecid ~ age + priorsCount, data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
recid5PredictTrain <- getPredict(recidTraining, recid5Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid5MatrixTrain <- table(recid5PredictTrain$isRecid,recid5PredictTrain$prediction)

recid5PredictTest <- getPredict(recidTesting, recid5Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid5MatrixTest <- table(recid5PredictTest$isRecid,recid5PredictTest$prediction)

#checkModel(recidTraining,recid5MatrixTrain)

#checkModel(recidTesting,recid5MatrixTest)

## Recid 6 Model

# Candidate final model: age plus both logged predictors, no sex
recid6Model <- glm(isRecid ~ age + logPriorsCount + logDaysInJail, data=recidTraining, family = binomial)

# 0.5-threshold confusion matrices for train and test sets
recid6PredictTrain <- getPredict(recidTraining, recid6Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid6MatrixTrain <- table(recid6PredictTrain$isRecid,recid6PredictTrain$prediction)

recid6PredictTest <- getPredict(recidTesting, recid6Model) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reoffend", "Reoffended"))
recid6MatrixTest <- table(recid6PredictTest$isRecid,recid6PredictTest$prediction)



#stored for use in markdown
trainList6 <- checkModel(recidTraining,recid6MatrixTrain)
trainError <- trainList6$error
trainPVal <- trainList6$pValue

testList6 <- checkModel(recidTesting,recid6MatrixTest)
testError <- testList6$error
testPVal <- testList6$pValue

# Values computed from the final (recid6) model for inline use in the
# markdown text. Assignments use <- per R convention (was `=`).
# NOTE(review): `matrix` shadows base::matrix for the rest of the script;
# the names are kept because later inline chunks may reference them.
matrix <- recid6MatrixTest
matrix1 <- recid6MatrixTrain
# Manual error rates: off-diagonal (misclassified) counts over sample size
manualErrorValueTest <- (matrix[1,2]+matrix[2,1])/nrow(recidTesting)
manualErrorValueTrain <- (matrix1[1,2]+matrix1[2,1])/nrow(recidTraining)

coeffs <- recid6Model$coefficients

# Observed proportion of reoffenders in the training data (baseline rate)
nullHyp <- (matrix1[2,1]+matrix1[2,2])/nrow(recidTraining)

# Phrasing selected by the test p-value for the write-up
canWe <- ifelse(testPVal > 0.05, "can", "cannot")

continueAnalysis <- ifelse(testPVal > 0.05, "should continue analyzing this data and other related data to create a statistically significant model", "should proceed to propose a new hypothesis which we may test in future analysis")

## Final Model

# recid6 (age + logPriorsCount + logDaysInJail) is selected as the final model;
# the prose below explains the equation, errors, confusion matrices, and p-value
finalModel <-  recid6Model

The model we made from this data to predict whether or not an inmate will reoffend without the use of the proprietary recidivation and violence scores included in the data is as follows:

\[willReoffend = e^{0.596374} \times e^{-0.0367996 \cdot age} \times \]

\[e^{0.8183817 \cdot log_{10}(priorsCount)} \times e^{0.0302903 \cdot log_{10}(daysInJail)}\]

The value produced by the model equation above is converted to a predicted probability and then rounded (thresholded at 0.5) to produce the final reoffend / did-not-reoffend classification.

In this model we began with every relevant variable (excluding the proprietary scores which we will include in the next model) which could reliably build a model when testing-training splits were made at random (this excluded the categorical variable chargeDegree due to its limited number of data points per category and chargeDesc as it was too much work to refine and had a very limited number of data points per category). We also chose to exclude race and sex due to the ethical concerns of including factors which would theoretically have no impact on the model considering equality between races and genders. From our everything model, we ruled out more factors due to their low significance and chose to include the log base 10 of certain variables as justified by the plots included in figures 1-7 where we discussed the log base 10 variants of these variables. After this, we gradually removed one variable at a time until we came to the minimal number of variables that still had a highly accurate prediction which resulted in the model shown above.

Some model coefficients listed above are a bit obscure in their effect. For example, the coefficients of the logged values indicate that the prediction (ideally between zero and one but rounded) will go up or down according to every power of ten included in the original variable. This is in contrast to the coefficient for the linear term, age, which indicates that the prediction will go down multiplicatively by the exponentiation of 0.0367996 likelihood to reoffend for every year of age the inmate has. These variables are all exponentiated to reflect the model utilizing the process of logistic regression.

This model, when demonstrated on the data it was trained on, has a prediction error of \(0.306875\) meaning that approximately \(30.69\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(30.69\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.

This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorProportion = \frac{705 + 277}{3200}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.306875.

This model, when demonstrated on testing data, has a prediction error of \(0.27875\) meaning that approximately \(27.88\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(27.88\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.

This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorProportion = \frac{150 + 73}{800}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.27875.

This model’s p-value regarding the testing data is \(0.001304\). When this value is below 0.05, that means the model fitted to the data is statistically significant as it has a less than \(5\%\) chance to occur randomly given random data whereas if it is above 0.05 it is not statistically significant because it has a greater than \(5\%\) chance to occur randomly given random data. This p-value was created by a function located at the top of the markdown; however, it was calculated with a one-tail hypothesis test comparing two proportions. The null model states that \(34.91\%\) of inmates will reoffend as this is the percentage of inmates in the data that do reoffend. The alternate hypothesis is that the recidivation rate is something other than this. The two proportions used for this hypothesis test are the error, which is the number of people who were incorrectly predicted over the number of total people, and the p-hat value, which is the total number of people who reoffended in the selected data set over the total number of people in that data set. The standard error used was from a standard calculation of standard error given population size and a single proportion. The p-value acquired from this hypothesis test tells us we cannot reject the null hypothesis meaning we should proceed to propose a new hypothesis which we may test in future analysis.

Logistic Regression 2: Predicting Recidivism with Black-Box Predictors in Data

This is our second model. It is a logistic regression (classification) model which predicts whether or not an inmate will reoffend within two years of being released. This time we are including in our model the black-box scores (riskRecidDecileScore and riskViolenceDecileScore) which are included in the data and were computed through use of a proprietary algorithm from a third party. The plots made for the prior model are all still relevant in this case. The following plot is an addition to the prior grouping which incorporates our new relevant predictor, riskRecidDecileScore.

Relevant Data Visualizations

Figure 8

### Risk Recid Score
# Density of the proprietary recidivism decile score, split by whether the
# inmate actually reoffended, to eyeball the score's predictive accuracy.
jjplotDensity(
  recidTraining,
  x = riskRecidDecileScore,
  fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
) +
  labs(
    x = "Risk Recidivation Score",
    fill = "Recidivated",
    title = "Risk Recidivation Score Accuracy in Predicting Recidivation"
  ) +
  theme_custom()

Figure 8 compares riskRecidDecileScore with the actual recidivation value of the inmate which makes for a nice accuracy comparison. We see in this plot what we would expect: as the recidivation score goes up, the proportion of inmates who reoffend increases in relation to the inmates who do not. This holds true until a recidivation score of roughly 8.0 where both progress downward. This is likely due to less data existing at this extreme; however, it could simply indicate that the model included in the data for riskRecidDecileScore does not accurately predict recidivation in the upper values of their 10 point scale.

The Model

# Candidate logistic-regression models (family = binomial) fit on the
# training split, including the proprietary black-box scores.
riskModel <- glm(isRecid ~ age + logPriorsCount + logDaysInJail + riskRecidDecileScore + riskViolenceDecileScore, data=recidTraining, family = binomial)
#summary(riskModel)

# Same model with riskViolenceDecileScore dropped.
riskModel2 <- glm(isRecid ~ age + logPriorsCount + logDaysInJail + riskRecidDecileScore, data=recidTraining, family = binomial)
#summary(riskModel2)

# Further refined: logDaysInJail also dropped. This is the model evaluated
# and kept as finalModel below.
riskModel3 <- glm(isRecid ~ age + logPriorsCount + riskRecidDecileScore, data=recidTraining, family = binomial)
#summary(riskModel3)

# Class predictions and confusion matrices for riskModel3 on both splits.
# getPredict() is a project helper — presumably it attaches predicted
# probabilities as recidPredict; confirm against its definition earlier in
# the file. Predictions are thresholded at 0.5.
# NOTE(review): "Reaffend(ed)" looks like a typo of "Reoffend(ed)" but is
# left unchanged in case downstream code matches these labels by name.
riskPredictTrain <- getPredict(recidTraining, riskModel3) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reaffend", "Reaffended"))
# Rows = actual isRecid, columns = predicted label.
riskMatrixTrain <- table(riskPredictTrain$isRecid,riskPredictTrain$prediction)

riskPredictTest <- getPredict(recidTesting, riskModel3) %>% 
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reaffend", "Reaffended"))
riskMatrixTest <- table(riskPredictTest$isRecid,riskPredictTest$prediction)

#(checkModel(recidTraining,riskMatrixTrain))

#(checkModel(recidTesting,riskMatrixTest))

# Stored for use in inline markdown below.
trainList <- checkModel(recidTraining,riskMatrixTrain)
trainError <- trainList$error
trainPVal <- trainList$pValue

testList <- checkModel(recidTesting,riskMatrixTest)
testError <- testList$error
testPVal <- testList$pValue

# FIX: the assignments below previously referenced recid6MatrixTest,
# recid6MatrixTrain, and recid6Model — objects belonging to the *previous*
# model's section — so the manual error values, coefficients, and null
# hypothesis reported for this model actually described the old model.
matrix <- riskMatrixTest
matrix1 <- riskMatrixTrain
# Off-diagonal cells are the misclassifications (false pos + false neg).
manualErrorValueTest <- (matrix[1,2]+matrix[2,1])/nrow(recidTesting)
manualErrorValueTrain <- (matrix1[1,2]+matrix1[2,1])/nrow(recidTraining)

coeffs <- riskModel3$coefficients

# Observed reoffense proportion in the testing split (row 2 = isRecid == 1),
# used as the null-model rate in the write-up.
nullHyp <- (matrix[2,1]+matrix[2,2])/nrow(recidTesting)

canWe <- ifelse(testPVal > 0.05, "can", "cannot")

continueAnalysis <- ifelse(testPVal > 0.05, "should continue analyzing this data and other related data to create a statistically significant model", "should proceed to propose a new hypothesis which we may test in future analysis")

## Final Model

finalModel <-  riskModel3

The model we made from this data to predict whether or not an inmate will reoffend with the use of the proprietary recidivation and violence scores included in the data is as follows:

\[willReoffend = \left\lceil e^{0.596374} \times e^{-0.0367996 \cdot age} \times e^{0.8183817 \cdot logPriorsCount} \times e^{0.0302903 \cdot riskRecidDecileScore} \right\rfloor\]

The model equation above uses a mixed pairing of one ceiling and one floor symbol; this is intended to communicate proper rounding of the result to the nearest integer.

In this model we began with the variables that ultimately were relevant to our last model but now with the added proprietary score variables riskRecidDecileScore and riskViolenceDecileScore. We still chose to exclude race and sex due to the ethical concerns of including factors which would theoretically have no impact on the model considering equality between races and genders. We ended up refining out the violence score and were left with the model above. This was the best model which still included one of the proprietary scores.

Some model coefficients listed above are obscure in their effect. For example, the coefficients of the logged values indicate that the prediction will go up or down according to every power of ten included in the original variable’s value. This is in contrast to the coefficient for the included linear term, age, which indicates that the prediction will change multiplicatively by the exponentiation of -0.0367996 likelihood to reoffend for every year of age the inmate has. These variables are all exponentiated to reflect the model utilizing the process of logistic regression.

This model, when demonstrated on the data it was trained on, has a prediction error of \(0.3075\) meaning that approximately \(30.75\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(30.75\%\) of inmates will be predicted to reoffend when they do not or will be predicted not to reoffend when they do.

This is information which can be found in our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. The combination of these two numbers over the total number of data points gives us our error measurement. \(errorProportion = \frac{705 + 277}{3200}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This results in the error value 0.306875, which does not match the \(0.3075\) reported above; these confusion-matrix counts appear to have been carried over from the previous model and should be recomputed from this model's matrix.

This model, when demonstrated on testing data, has a prediction error of \(0.28625\) meaning that approximately \(28.62\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(28.62\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.

This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorProportion = \frac{150 + 73}{800}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This results in the error value 0.27875, which does not match the \(0.28625\) reported above; these counts appear to have been carried over from the previous model and should be recomputed from this model's matrix.

This model’s p-value regarding the testing data is \(0.0034229\). When this value is below 0.05, that means the model fitted to the data is statistically significant as it has a less than \(5\%\) chance to occur randomly given random data whereas if it is above 0.05 it is not statistically significant because it has a greater than \(5\%\) chance to occur randomly given random data. This p-value was created by a function located at the top of the markdown; however, it was calculated with a one-tail hypothesis test comparing two proportions. The null model states that \(32.88\%\) of inmates will reoffend as this is the percentage of inmates in the data that do reoffend. The alternate hypothesis is that the recidivation rate is something other than this. The two proportions used for this hypothesis test are the error, which is the number of people who were incorrectly predicted over the number of total people, and the p-hat value, which is the total number of people who reoffended in the selected data set over the total number of people in that data set. The standard error used was from a standard calculation of standard error given population size and a single proportion. The p-value acquired from this hypothesis test tells us we cannot reject the null hypothesis meaning we should proceed to propose a new hypothesis which we may test in future analysis.

This model was actually worse than the last model we made. This means that the proprietary scores are not good predictors of recidivism in the context of this data and it is better to disregard them and make a model which is entirely our own to predict this.

Multiple Regression 1: Predicting Risk of Recidivism Decile Score

This is our third model. This model is a multiple regression model attempting to achieve parity with the algorithm which was used to calculate riskRecidDecileScore. The following illustrate some interesting relationships in this data.

Relevant Data Visualizations

Figure 9

## Age Cat vs riskRecidDecileScore
# Density of the recidivism decile score by actual outcome, faceted on
# age category, to check the score's accuracy within each age band.
recidTraining2 %>%
  ggplot(aes(x = riskRecidDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ ageCat) +
  labs(
    title = "Recidivation Decile Score Distribution by Age Category",
    fill = "Recidivated",
    x = "Risk of Recidivation (Decile Score)"
  ) +
  theme_custom()

Figure 9 shows the relationship between an inmate’s age category and their riskRecidDecileScore. The plot shows that the recidivation decile score predicts the least accurately in people greater than 45 years old as the blue and red fields should cross at exactly five for perfect accuracy whereas they do not intersect at this point in the age category of 45 or older. The other columns behave properly and as expected. This will be included in our initial model to potentially be ruled out later.

Figure 10

## Race vs riskRecidDecileScore

# Density of the recidivism decile score by actual outcome, faceted on
# race, to surface any racial bias in the proprietary score.
recidTraining2 %>%
  ggplot(aes(x = riskRecidDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ race) +
  labs(
    title = "Recidivation Decile Score by Race",
    fill = "Recidivated",
    x = "Risk of Recidivation (Decile Score)"
  ) +
  theme_custom()

Figure 10 shows the relationship between an inmate’s race and their riskRecidDecileScore. The plot shows that there is a heavy prediction bias for black inmates. This is a very concerning trend. Because we are trying to predict as closely as possible to the proprietary scores included, we will include race in our final model; however, this is not an ethical means of classifying or scoring individuals.

Figure 11

## Charge Degree vs riskRecidDecileScore

# Density of the recidivism decile score by actual outcome, faceted on
# charge degree, to see whether the proprietary score varies by charge.
recidTraining2 %>%
  ggplot(aes(x = riskRecidDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ chargeDegree) +
  labs(
    title = "Recidivation Decile Score by Charge Degree",
    fill = "Recidivated",
    x = "Risk of Recidivation (Decile Score)"
  ) +
  theme_custom()

Figure 11 shows the relationship between the degree of an inmate’s criminal charge and their riskRecidDecileScore. This figure shows that there are very different relationships between each charge score and the accuracy of the riskRecidDecileScore. This indicates that chargeDegree is likely an indicator used in the proprietary model and as such we will include it in our model to potentially be refined out later.

The Model

# Full candidate model for predicting the proprietary recidivism score.
scoreSubsetModel <- lm(
  riskRecidDecileScore ~ logPriorsCount + priorsCount + age + chargeDegree +
    logDaysInJail + daysInJail + sex + race,
  data = recidTraining2
)

# Best-subset search (olsrr) across the candidate predictors.
olsSubset <- ols_step_best_subset(scoreSubsetModel)

# Final model chosen from the best-subset results.
scoreFinalModel <- lm(
  riskRecidDecileScore ~ priorsCount + age + chargeDegree + logDaysInJail + race,
  data = recidTraining2
)

# Predictions on each split, used for the RMSE / correlation figures below.
scoreTrainingPredicts <- predict.lm(scoreFinalModel, newdata = recidTraining2)
scoreTestingPredicts <- predict.lm(scoreFinalModel, newdata = recidTesting2)

# Coefficients inlined into the markdown write-up below.
coeffs <- scoreFinalModel$coefficients

trainRMSE <- RMSE(scoreTrainingPredicts, recidTraining2$riskRecidDecileScore)
rmse <- RMSE(scoreTestingPredicts, recidTesting2$riskRecidDecileScore)
corr <- cor(scoreTestingPredicts, recidTesting2$riskRecidDecileScore)

Overview

The model given our training data to predict riskRecidDecileScore is as follows:

\[predictedRecidDecileScore = 5.5144564 + 0.2469088 \cdot priorsCount + -0.098403 \cdot age +\] \[0.6184054 \cdot chargeDeg_{F2} + 0.7978429 \cdot chargeDeg_{F3} + -1.2380345 \cdot chargeDeg_{F5} +\] \[0.0477814 \cdot chargeDeg_{F7} + 0.1860794 \cdot chargeDeg_{M1} + 0.3707825 \cdot chargeDeg_{M2} +\] \[1.6725884 \cdot chargeDeg_{MO3} + 0.696719 \cdot chargeDeg_{NI0} + 0.924403 \cdot logDaysInJail +\] \[0.7837351 \cdot race_{black} + -0.3393149 \cdot race_{hispanic} + -0.4842382 \cdot race_{other *non-white*}\] As can be seen, this model has some interesting components. Firstly, the last component is intended to include any race which is not black, hispanic, or white. White was not included in the model meaning that an inmate being white does not affect their recidivism prediction. This does not however mean that we can lump them in with the “other” category. Next, chargeDeg is intended to represent the chargeDegree variable. Most of these components are simply linear, meaning that being part of a specific category or having one extra increment in those variables such as chargeDegree or priorsCount respectively will directly add that value to your recidivation score. The logDaysInJail category on the other hand will add 0.924403 to your recidivation score for every factor of 10 that is included in the standard version of the variable, daysInJail.

This model started with all variables and was refined using the olsrr library to find the best subset model. After that, we constructed our final model and made our predictions to measure RMSE and correlation coefficient with.

Diagnostic Plots

autoplot(scoreFinalModel)[1:2] + theme_custom()

As can be seen in our diagnostic plots. This model is borderline acceptable, but not very good. The residual plot shows a \(-x^3\) pattern meaning there is likely some other factor explaining part of this data. We could not find this factor in the included data. The Q-Q plot shows that this model is fairly reliable unless we are predicting in the lower recidivation score range. From this, we can determine that this data shouldn’t be modeled in this way or is more likely missing a significant predictor to be completely accurate as this model is the best model which could be made that we found within this data.

Correlation Coefficient \(= 0.6864077\)
A correlation coefficient of \(0.6864077\) indicates that \(68.64\%\) of the variation in recidivation score is explained by the model. The closer this is to \(1\) or \(-1\), the better the model fits the data provided and, therefore, the more likely it is to be effective at predicting recidivation score. A value of 0.6864077 is not great, but would indicate that this model can predict recidivation score at the very least within an appropriate range of the actual value. We would need more effective indicators included in the inmate data in order to properly model this data with a higher correlation value.

In relation to our training data, this model has an RMSE of 2.0495382. This means that our predictions have a residual mean square error of 2.0495382 on a 10 point scale. An average of 2 points of error on the training data is quite bad. Let’s move onto the testing RMSE.

In relation to our testing data, this model has an RMSE of 2.0958573. This means that our predictions have a residual mean square error of 2.0958573 on a 10 point scale. An average of 2.1 points of error is not horrible, but it isn’t great either. The testing and training RMSE values don’t differ greatly which indicates that there are predictors missing from this data which are useful in predicting this value. Sadly we did not find a better model for this score which indicates that the model from the proprietary provider likely involved some information which was not included in this data.

Multiple Regression 2: Predicting Risk of Violence Decile Score

This is our fourth model. This model is a multiple regression model attempting to achieve parity with the algorithm which was used to calculate riskViolenceDecileScore. The following plots show some meaningful relationships in the data relevant to this model.

Relevant Data Visualizations

Figure 12

## Age Cat vs riskViolenceDecileScore
# Density of the violence decile score by actual outcome, faceted on
# age category.
recidTraining2 %>%
  ggplot(aes(x = riskViolenceDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ ageCat) +
  labs(
    fill = "Recidivated",
    x = "Recidivation Violence Decile Score",
    title = "Violence Score by Age Category"
  ) +
  theme_custom()

Figure 12 shows different relationships between violence score and reoffence based on age range meaning that age range could be a predictor of violence score. Age category will be included in the initial model to be potentially refined out later.

Figure 13

## Race vs riskViolenceDecileScore

# Density of the violence decile score by actual outcome, faceted on race.
recidTraining2 %>%
  ggplot(aes(x = riskViolenceDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ race) +
  labs(
    fill = "Recidivated",
    x = "Recidivation Violence Decile Score",
    title = "Violence Score by Race"
  ) +
  theme_custom()

Figure 13 shows, similarly to recidivation score, that black inmates have a different distribution of violence scores than other races meaning there could be some institutional bias included here. Again, this is unethical to include on a normal basis, but because we are trying to best predict the model someone else has already made, we are considering race a potential factor and it will be included in the initial model.

Figure 14

## Charge Degree vs riskViolenceDecileScore

# Density of the violence decile score by actual outcome, faceted on
# charge degree.
recidTraining2 %>%
  ggplot(aes(x = riskViolenceDecileScore, fill = recidCat)) +
  geom_density(alpha = 0.4) +
  facet_wrap(~ chargeDegree) +
  labs(
    fill = "Recidivated",
    x = "Recidivation Violence Decile Score",
    title = "Violence Score by Charge Degree"
  ) +
  theme_custom()

Figure 14 shows that charge degree has different distributions of violence scores meaning that it could be a good predictor of violence score and will be included in the initial model to be potentially refined out later.

The Model

## Best Subset

# Full candidate model for predicting the violence decile score.
# FIX: this lm() previously fit on recidTesting2 — the held-out split —
# which leaks test data into best-subset selection; fit on recidTraining2,
# consistent with scoreSubsetModel earlier in the file.
violenceSubsetModel <- lm(riskViolenceDecileScore ~ logPriorsCount + priorsCount + age + chargeDegree + logDaysInJail + daysInJail + sex + race, data = recidTraining2)

# Best-subset search (olsrr) across the candidate predictors.
olsSubset <- ols_step_best_subset(violenceSubsetModel)

# Final model chosen from the best-subset results.
violenceFinalModel <- lm(riskViolenceDecileScore ~ priorsCount + age + logDaysInJail + race, data = recidTraining2)

# Predictions on each split, used for the RMSE / correlation figures below.
violenceTrainingPredicts <- predict.lm(violenceFinalModel, newdata = recidTraining2)

violenceTestingPredicts <- predict.lm(violenceFinalModel, newdata = recidTesting2)

trainRMSE <- RMSE(violenceTrainingPredicts, recidTraining2$riskViolenceDecileScore)

rmse <- RMSE(violenceTestingPredicts, recidTesting2$riskViolenceDecileScore)

corr <- cor(violenceTestingPredicts, recidTesting2$riskViolenceDecileScore)

# Coefficients inlined into the markdown write-up below.
coeffs <- violenceFinalModel$coefficients

Overview

The model to predict riskViolenceDecileScore is as follows:

\[predictedViolenceScore = 6.7015258 + 0.1191035 \cdot priorsCount +\] \[-0.1227209 \cdot age + 0.7257646 \cdot logDaysInJail + 0.6358164 \cdot race_{black} +\] \[-0.169199 \cdot race_{hispanic} + -0.201816 \cdot race_{other *non-white*}\] Similarly to the last model, this model has some interesting components. Firstly, the last component is intended to include any race which is not black, hispanic, or white. White was not included in the model meaning that an inmate being white does not affect their recidivism prediction. This does not however mean that we can lump them in with the “other” category. Next, chargeDeg is intended to represent the chargeDegree variable. Most of these components are simply linear, meaning that being part of a specific category or having one extra increment in those variables such as race or priorsCount respectively will directly add that value to your recidivation score. The logDaysInJail category on the other hand will add 0.7257646 to your recidivation score for every factor of 10 that is included in the standard version of the variable, daysInJail.

This model started with all variables and was refined using the olsrr library to find the best subset model. After that, we constructed our final model and made our predictions to measure RMSE and correlation coefficient with.

Diagnostic Plots

autoplot(violenceFinalModel)[1:2] + theme_custom()

These diagnostic plots above show us that this model is not great. The residual plot shows a clear trend through all of the data’s range and the Q-Q plot only meets the guideline in the center of the plot at zero, immediately leaving the linear guide when moving left or right. We can see clearly that this data should not be modeled this way or has predictors used in the proprietary algorithm which are not included in the data.

Correlation Coefficient \(= 0.6735538\)
A correlation coefficient of \(0.6735538\) indicates that \(67.36\%\) of the variation in recidivation score is explained by the model. The closer this is to \(1\) or \(-1\), the better the model fits the data provided and, therefore, the more likely it is to be effective at predicting violence score. A value of 0.6735538 is not great, but would indicate that this model can predict violence score at the very least within an appropriate range of the actual value. We would need more effective indicators included in the inmate data in order to properly model this data with a higher correlation value.

In relation to our training data, this model has an RMSE of 1.7819866. This means that our predictions have a residual mean square error of 1.7819866 on a 10 point scale. An average of 1.8 points of error on the training data is quite bad. Let’s move onto the testing RMSE.

In relation to our testing data, this model has an RMSE of 1.8669443. This means that our predictions have a residual mean square error of 1.8669443 on a 10 point scale. An average of 1.9 points of error is not horrible, and is better than our recidivation score model, but it isn’t great either. The testing and training RMSE values don’t differ greatly which indicates that there are predictors missing from this data which are useful in predicting this value. Sadly we did not find a better model for this score which indicates that the model from the proprietary provider likely involved some information which was not included in this data.

Task 3: Split Over Two Sub-Populations

Males Model

This is our classification model which predicts whether or not an inmate will reoffend given the fact that all of the inmates analyzed are male.

Relevant Data Visualizations

Figure 15

### DaysInJail

# Density of raw days in jail by recidivism outcome.
p25 <- maleTraining %>%
  jjplotDensity(x = daysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Days in Jail",
    x = "Days in Jail"
  ) +
  theme_custom()

# Density of log10(days in jail).
# FIX: the title string was missing its closing parenthesis.
p26 <- maleTraining %>%
  jjplotDensity(x = logDaysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "log10(Days in Jail)",
    x = "log10(Days in Jail)"
  ) +
  theme_custom() +
  theme(legend.position = "none")

# Boxplot of raw days in jail by recidivism outcome.
p27 <- maleTraining %>%
  jjplotBoxplot(x = isRecid, y = daysInJail, fill = as.factor(isRecid)) +
  labs(
    title = "Days in Jail",
    x = "Recidivated",
    y = "Days in Jail"
  ) +
  theme_custom() +
  theme(legend.position = "none")

# Boxplot of log10(days in jail); the recoded fill (Yes/No) on this panel
# supplies the shared legend for the assembled figure.
p28 <- maleTraining %>%
  jjplotBoxplot(x = isRecid, y = logDaysInJail, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  labs(
    title = "log10(Days in Jail)",
    x = "Recidivated",
    y = "log10(Days in Jail)",
    fill = "Recidivated"
  ) +
  theme_custom()

# Assemble the four panels with a single collected legend.
p25 + p26 + p27 + p28 +
  plot_annotation(
    title = "Days in Jail and log10(Days in Jail)",
    theme = theme_custom()
  ) +
  plot_layout(guides = "collect")

Figure 15 shows the relationship between daysInJail and whether or not an inmate will reoffend. The figure shows clearly that the log base 10 of daysInJail is more sensitive and effective at predicting whether or not an inmate will reoffend as shown by the lowered skewness of the plot and the lowered number of outliers on the boxplot graph. We will include logDaysInJail in our model with the potential for it to be ruled out later.

Figure 16

### Priors Count

# Density of raw priors counts by recidivism outcome.
p26 <- jjplotDensity(maleTraining, x = priorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    x = "Priors Counts",
    title = "Priors Counts"
  ) +
  theme_custom() +
  theme(legend.position = "none")

# Density of the log10-transformed priors counts.
p27 <- jjplotDensity(maleTraining, x = logPriorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    x = "log10(Priors Counts) + 0.1",
    title = "log10(Priors Counts) + 0.1"
  ) +
  theme_custom() +
  theme(legend.position = "none")

# Boxplot of raw priors counts by recidivism outcome.
p28 <- jjplotBoxplot(maleTraining, x = isRecid, y = priorsCount, fill = as.factor(isRecid)) +
  labs(
    y = "Priors Counts",
    x = "Recidivated",
    fill = "Recidivated",
    title = "Priors Counts"
  ) +
  theme_custom()

# Boxplot of the log10-transformed priors counts.
p29 <- jjplotBoxplot(maleTraining, x = isRecid, y = logPriorsCount, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    y = "log10(Priors Counts) + 0.1",
    x = "Recidivated",
    title = "log10(Priors Counts) + 0.1"
  ) +
  theme_custom() +
  theme(legend.position = "none")

# Assemble the four panels with a single collected legend.
p26 + p27 + p28 + p29 +
  plot_annotation(title = "Male Recidivation by Priors Counts", theme = theme_custom()) +
  plot_layout(guides = "collect")

Figure 16 shows the relationship between priorsCount and whether or not an inmate reoffended. It is clear by the plots that the log base 10 of priorsCount is a more sensitive and better indicator of whether or not an inmate will reoffend than the standard priorsCount. Therefore, we will include logPriorsCount in our model to be potentially refined out later.

Figure 17

### Juvenile Priors Count

# Density of raw juvenile priors counts by recidivism outcome.
p30 <- jjplotDensity(maleTraining, x = juvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    x = "Juvenile Priors Counts",
    title = "Juvenile Priors Counts"
  )

# Density of the log10-transformed juvenile priors counts.
p31 <- jjplotDensity(maleTraining, x = logJuvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    x = "log10(Juvenile Priors Counts) + 0.1",
    title = "log10(Juvenile Priors Counts) + 0.1"
  )

# Boxplot of raw juvenile priors counts by recidivism outcome.
p32 <- jjplotBoxplot(maleTraining, x = isRecid, y = juvCount, fill = as.factor(isRecid)) +
  labs(
    y = "Juvenile Priors Counts",
    x = "Recidivated",
    fill = "Recidivated",
    title = "Juvenile Priors Counts"
  )

# Boxplot of the log10-transformed juvenile priors counts.
p33 <- jjplotBoxplot(maleTraining, x = isRecid, y = logJuvCount, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    y = "log10(Juvenile Priors Counts) + 0.1",
    x = "Recidivated",
    fill = "Recidivated",
    title = "log10(Juvenile Priors Counts) + 0.1"
  )

# Assemble the four panels with a single collected legend.
p30 + p31 + p32 + p33 +
  plot_annotation(title = "Male Recidivation by Juvenile Priors Counts", theme = theme_custom()) +
  plot_layout(guides = "collect")

Figure 17 shows us that juvenilePriorCounts is not an effective predictor of recidivation in males as there is an extreme amount of left skew indicating that there is not a lot of variation in data for this category which makes this category rather ineffective to predict anything for this model.

Figure 18

### Age

# Density of age by recidivism outcome.
# FIX: the x-axis label previously read "Juvenile Priors Counts" — a
# copy-paste from the prior figure; this plot's x variable is age.
p34 <- maleTraining %>%
  jjplotDensity(x = age, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Age",
    x = "Age"
  )

# Boxplot of age by recidivism outcome; the recoded fill (Yes/No) carries
# the shared legend.
p35 <- maleTraining %>%
  jjplotBoxplot(x = isRecid, y = age, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title = "Age",
    x = "Recidivated",
    y = "Age",
    fill = "Recidivated"
  )

# Stack the two panels vertically.
p34 / p35 + plot_annotation(title = "Male Recidivation by Age", theme=theme_custom())

Figure 18 shows us that age appears to be a factor in whether or not male inmates reoffend. This factor will be included in our initial males model to potentially be refined out later.

Figure 19

### Risk Recid Score
# Figure 19: density of risk-of-recidivism decile scores for male inmates,
# split by whether they reoffended (fill recoded to Yes/No for the legend).
jjplotDensity(
  maleTraining,
  x = riskRecidDecileScore,
  fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0"),
  color = as.factor(isRecid)
) +
  labs(
    fill = "Recidivated",
    x = "Risk Recidivation Decile Score",
    title = "Male Recidivation by Risk Recidivation Score"
  )

Figure 19 shows us that riskRecidivationDecileScore (RRDS) is a fairly good predictor for male inmates; however, like earlier in the bisex model, this falls off around RRDS 8. This will be included as an initial predictor to potentially be refined out later.

Figure 20

### Colinearity Check

# Figure 20: pairwise scatter plots of the continuous candidate predictors
# for the male model, used only to eyeball collinearity between them.
p36 <- ggplot(
  maleTraining,
  aes(
    x = logDaysInJail,
    y = logPriorsCount,
    color = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  )
) +
  geom_point() +
  theme_custom() +
  labs(
    x = "log10(Days In Jail)",
    y = "log10(Priors Count)",
    color = "Recidivated",
    title = "log10(Days In Jail) vs log10(Priors Count)"
  )

p37 <- ggplot(
  maleTraining,
  aes(
    x = logDaysInJail,
    y = age,
    color = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  )
) +
  geom_point() +
  theme_custom() +
  labs(
    x = "log10(Days In Jail)",
    y = "Age",
    color = "Recidivated",
    title = "log10(Days In Jail) vs Age"
  )

p38 <- ggplot(
  maleTraining,
  aes(
    x = logPriorsCount,
    y = age,
    color = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  )
) +
  geom_point() +
  theme_custom() +
  labs(
    x = "log10(Priors Count)",
    y = "Age",
    color = "Recidivated",
    title = "log10(Priors Count) vs Age"
  )

# Compose: p36 on top, p37 and p38 side by side, with one shared legend.
p36 / (p37 + p38) +
  plot_annotation(title = "Colinearity Check", theme = theme_custom()) +
  plot_layout(guides = "collect")

Figure 20 is exclusively for the purpose of checking collinearity between continuous variables. Looking at the plot, we can see no relationships whatsoever between variables, meaning that these variables are not collinear.

The Male Model

This is the model which we developed for better predicting recidivism in male populations as compared to mixed populations:

# Fit and evaluate the male-only recidivism model (logistic regression).
maleModel <- glm(isRecid ~ age + priorsCount + daysInJail, data = maleTraining, family = binomial)

# Label each training-set prediction using a 0.5 probability cutoff.
# (NB: "Reaffend" is misspelled; left untouched because the confusion-matrix
# indexing below depends on the alphabetical order of these exact labels.)
malePredictTrain <- getPredict(maleTraining, maleModel) %>%
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reaffend", "Reaffended"))
maleMatrixTrain <- table(malePredictTrain$isRecid, malePredictTrain$prediction)

# NOTE(review): test predictions are made on the mixed-sex recidTesting set
# rather than a male-only test split — confirm this is intentional.
malePredictTest <- getPredict(recidTesting, maleModel) %>%
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reaffend", "Reaffended"))
maleMatrixTest <- table(malePredictTest$isRecid, malePredictTest$prediction)

# Error rate and p-value on training data, stored for inline use in the text.
trainList <- checkModel(recidTraining, maleMatrixTrain)
trainError <- trainList$error
# NOTE(review): the female section reads trainList$pValue (capital V).
# `$` is case-sensitive, so one of the two spellings silently returns NULL —
# verify which name checkModel() actually returns.
trainPVal <- trainList$pvalue

testList <- checkModel(recidTesting, maleMatrixTest)
testError <- testList$error
testPVal <- testList$pvalue

# Saved for the cross-model comparison at the end of the report.
bmodelErrorM <- testError

# Manual error check: off-diagonal counts (false positives + false negatives)
# over the number of rows in the corresponding data frame.
matrix = maleMatrixTest
matrix1 = maleMatrixTrain
manualErrorValueTest <- (matrix[1,2]+matrix[2,1])/nrow(recidTesting)
manualErrorValueTrain <- (matrix1[1,2]+matrix1[2,1])/nrow(recidTraining)

\[willReoffend_{male} = e^{6.7015258} \times e^{0.1191035 \cdot age} \times\] \[e^{-0.1227209 \cdot priorsCount} \times e^{0.7257646 \cdot daysInJail}\]

In this model we began with all variables aside from race and ended up, through the process of refining the model, with the model shown above.

Some model coefficients listed above are obscure in their effect. For example, the coefficients of the logged values indicate that the prediction will go up or down according to every power of ten included in the original variable’s value. This is in contrast to the coefficient for the included linear term, age, which indicates that the prediction will change multiplicatively by the exponentiation of 0.1191035 likelihood to reoffend for every year of age the inmate has. These variables are all exponentiated to reflect the model utilizing the process of logistic regression.

This model, when demonstrated on the data it was trained on, has a prediction error of \(0.1178707\) meaning that approximately \(11.79\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(11.79\%\) of inmates will be predicted to reoffend when they do not or will be predicted not to reoffend when they do.

This is information which can be found in our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. The combination of these two numbers over the total number of data points gives us our error measurement. \(errorProportion = \frac{291 + 81}{3156}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.1178707.

This model, when demonstrated on testing data, has a prediction error of \(0.2696203\) meaning that approximately \(26.96\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(26.96\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.

This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorPercentage = \frac{166 + 47}{790}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.2696203.

Females Model

This is our classification model which predicts whether or not an inmate will reoffend given the fact that all of the inmates analyzed are female.

Relevant Data Visualizations

Figure 21

### DaysInJail

# Figure 21: days in jail for female inmates, raw and log10-transformed,
# as densities (p39, p40) and boxplots (p41, p42).
p39 <- femaleTraining %>%
  jjplotDensity(x = daysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Days in Jail",
    x = "Days in Jail"
  )

p40 <- femaleTraining %>%
  jjplotDensity(x = logDaysInJail, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    # Fixed: title was missing its closing parenthesis.
    title = "log10(Days in Jail)",
    x = "log10(Days in Jail)"
  )

# Fixed: x/y labels were swapped relative to the aesthetics (x = isRecid,
# y = daysInJail) and to every other boxplot in the document.
p41 <- femaleTraining %>%
  jjplotBoxplot(x = isRecid, y = daysInJail, fill = as.factor(isRecid)) +
  labs(
    title = "Days in Jail",
    x = "Recidivated",
    y = "Days in Jail"
  )

# p42 recodes 0/1 to No/Yes so the collected legend is readable.
p42 <- femaleTraining %>%
  jjplotBoxplot(x = isRecid, y = logDaysInJail, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  labs(
    title = "log10(Days in Jail)",
    x = "Recidivated",
    y = "log10(Days in Jail)",
    fill = "Recidivated"
  )

p39 + p40 + p41 + p42 +
  plot_annotation(
    title = "Days in Jail and log10(Days in Jail)",
    theme = theme_custom()
  ) + plot_layout(guides = "collect")

Figure 21 shows us that, although logDaysInJail is a better predictor of recidivation in female populations than the vanilla daysInJail value, it still is not a great indicator. These plots tell us that we can most likely remove this from our model.

Figure 22

### Priors Count

# Figure 22: priors count for female inmates, raw and log10(x + 0.1)
# transformed, as densities (p43, p44) and boxplots (p45, p46).
p43 <- jjplotDensity(femaleTraining, x = priorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(x = "Priors Counts", title = "Priors Counts")

p44 <- jjplotDensity(femaleTraining, x = logPriorsCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(x = "log10(Priors Counts + 0.1)", title = "log10(Priors Counts + 0.1)")

p45 <- jjplotBoxplot(femaleTraining, x = isRecid, y = priorsCount, fill = as.factor(isRecid)) +
  labs(
    fill = "Recidivated",
    y = "Priors Counts",
    x = "Recidivated",
    title = "Priors Counts"
  )

# Recode 0/1 to No/Yes and show this panel's legend; patchwork collects it
# for the whole figure.
p46 <- jjplotBoxplot(femaleTraining, x = isRecid, y = logPriorsCount, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    fill = "Recidivated",
    y = "log10(Priors Counts + 0.1)",
    x = "Recidivated",
    title = "log10(Priors Counts + 0.1)"
  )

p43 + p44 + p45 + p46 +
  plot_annotation(title = "Priors Counts", theme = theme_custom()) +
  plot_layout(guides = "collect")

Figure 22 shows us that, in female populations, logPriorsCount is a much more effective and inclusive predictor than the standard priorsCount. This plot shows us that we will likely include this in our model as there are no outliers in the logPriorsCount predictor.

Figure 23

### Juvenile Priors Count

# Figure 23: juvenile priors count for female inmates, raw and
# log10(x + 0.1) transformed, as densities (p47, p48) and boxplots (p49, p50).
p47 <- jjplotDensity(femaleTraining, x = juvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(x = "Juvenile Priors Counts", title = "Juvenile Priors Counts")

p48 <- jjplotDensity(femaleTraining, x = logJuvCount, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(x = "log10(Juvenile Priors Counts + 0.1)", title = "log10(Juvenile Priors Counts + 0.1)")

p49 <- jjplotBoxplot(femaleTraining, x = isRecid, y = juvCount, fill = as.factor(isRecid)) +
  labs(
    fill = "Recidivated",
    y = "Juvenile Priors Counts",
    x = "Recidivated",
    title = "Juvenile Priors Counts"
  )

# Recode 0/1 to No/Yes and show this panel's legend; patchwork collects it
# for the whole figure.
p50 <- jjplotBoxplot(femaleTraining, x = isRecid, y = logJuvCount, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    fill = "Recidivated",
    y = "log10(Juvenile Priors Counts + 0.1)",
    x = "Recidivated",
    title = "log10(Juvenile Priors Counts + 0.1)"
  )

p47 + p48 + p49 + p50 +
  plot_annotation(title = "Juvenile Priors Counts", theme = theme_custom()) +
  plot_layout(guides = "collect")

Figure 23 shows us that juvPriorsCount has little to no significance to whether or not a female inmate will reoffend. This tells us that we can exclude this from our model.

Figure 24

### Age

# Figure 24: age distribution (p51) and boxplot (p52) for female inmates,
# split by whether they reoffended.
p51 <- femaleTraining %>%
  jjplotDensity(x = age, fill = as.factor(isRecid), color = as.factor(isRecid)) +
  labs(
    title = "Age",
    # Fixed: this axis previously carried a copy-pasted "Juvenile Priors
    # Counts" label on an Age plot.
    x = "Age"
  )

p52 <- femaleTraining %>%
  jjplotBoxplot(x = isRecid, y = age, fill = fct_recode(as.factor(isRecid), Yes = "1", No = "0")) +
  theme(legend.position = "right") +
  labs(
    title = "Age",
    x = "Recidivated",
    y = "Age",
    fill = "Recidivated"
  )

# Title added for consistency with the male counterpart of this figure.
p51 / p52 + plot_annotation(title = "Female Recidivation by Age", theme = theme_custom())

Figure 24 shows us that age is a good indicator of recidivation in female inmate populations. We will likely include this value in our model.

Figure 25

### Risk Recid Score
# Figure 25: density of risk-of-recidivism decile scores for female inmates,
# split by whether they reoffended.
jjplotDensity(
  femaleTraining,
  x = riskRecidDecileScore,
  fill = as.factor(isRecid),
  color = as.factor(isRecid)
) +
  labs(
    x = "Risk Recidivation Decile Score",
    title = "Female Recidivation by Risk Recidivation Decile Score"
  )

Figure 25 shows a strong correlation between whether or not an individual female inmate reoffended and her risk of recidivation decile score. This could potentially be a predictor in our model, which may be refined out later. (Blue means an inmate reoffended, whereas red means an inmate did not reoffend.)

Figure 26

### Colinearity Check

# Figure 26: pairwise scatter plots of the continuous candidate predictors
# for the female model, used only to eyeball collinearity between them.
p53 <- ggplot(
  femaleTraining,
  aes(
    x = logDaysInJail,
    y = logPriorsCount,
    color = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  )
) +
  geom_point() +
  theme_custom() +
  labs(
    x = "log10(Days In Jail)",
    y = "log10(Priors Count)",
    color = "Recidivated",
    title = "log10(Days In Jail) vs log10(Priors Count)"
  )

p54 <- ggplot(
  femaleTraining,
  aes(
    x = logDaysInJail,
    y = age,
    color = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  )
) +
  geom_point() +
  theme_custom() +
  labs(
    x = "log10(Days In Jail)",
    y = "Age",
    color = "Recidivated",
    title = "log10(Days In Jail) vs Age"
  )

p55 <- ggplot(
  femaleTraining,
  aes(
    x = logPriorsCount,
    y = age,
    color = fct_recode(as.factor(isRecid), Yes = "1", No = "0")
  )
) +
  geom_point() +
  theme_custom() +
  labs(
    x = "log10(Priors Count)",
    y = "Age",
    color = "Recidivated",
    title = "log10(Priors Count) vs Age"
  )

# Compose: p53 on top, p54 and p55 side by side, with one shared legend.
p53 / (p54 + p55) +
  plot_annotation(title = "Colinearity Check", theme = theme_custom()) +
  plot_layout(guides = "collect")

Figure 26 tests the continuous variables which are potentially included in our model. This plot clearly shows no correlation between any of the three included variables, meaning that they are not collinear.

The Female Model

This is the model which we developed for better predicting recidivism in female populations as compared to mixed populations:

# Fit and evaluate the female-only recidivism model (logistic regression).
femaleModel <- glm(isRecid ~ age + priorsCount, data = femaleTraining, family = binomial)

# Label each training-set prediction using a 0.5 probability cutoff.
# (NB: "Reaffend" misspelling kept; the confusion-matrix indexing below
# depends on the alphabetical order of these exact labels.)
femalePredictTrain <- getPredict(femaleTraining, femaleModel) %>%
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reaffend", "Reaffended"))
femaleMatrixTrain <- table(femalePredictTrain$isRecid, femalePredictTrain$prediction)

# NOTE(review): test predictions use the mixed-sex recidTesting set, not a
# female-only test split — confirm this is intentional.
femalePredictTest <- getPredict(recidTesting, femaleModel) %>%
  mutate(prediction = ifelse(recidPredict < 0.5, "Did Not Reaffend", "Reaffended"))
femaleMatrixTest <- table(femalePredictTest$isRecid, femalePredictTest$prediction)

#stored for use in markdown
trainList <- checkModel(recidTraining,femaleMatrixTrain)
trainError <- trainList$error
# NOTE(review): the male section reads trainList$pvalue (lower-case v); `$`
# is case-sensitive, so one of the two spellings silently returns NULL —
# verify which name checkModel() actually returns.
trainPVal <- trainList$pValue

testList <- checkModel(recidTesting,femaleMatrixTest)
testError <- testList$error
testPVal <- testList$pValue

bestModelF <- testError

# Manual error check: off-diagonal counts (false positives + false negatives)
# over the number of rows in the corresponding data frame.
matrix = femaleMatrixTest
matrix1 = femaleMatrixTrain
manualErrorValueTest <- (matrix[1,2]+matrix[2,1])/nrow(recidTesting)
manualErrorValueTrain <- (matrix1[1,2]+matrix1[2,1])/nrow(recidTraining)

coeffs = femaleModel$coefficients

# Baseline recidivation proportion in the testing data (null-model rate).
nullHyp = (matrix[2,1]+matrix[2,2])/nrow(recidTesting)

# Fixed inverted logic: when p > 0.05 we CANNOT reject the null hypothesis;
# when p <= 0.05 we CAN (the original had the branches swapped).
canWe = ifelse(testPVal > 0.05, "cannot", "can")

# Branches swapped to match the corrected canWe logic above.
continueAnalysis = ifelse(testPVal > 0.05, "should proceed to propose a new hypothesis which we may test in future analysis", "should continue analyzing this data and other related data to create a statistically significant model")

\[willReoffend_{female} = e^{-0.4667208} \times\] \[e^{-0.0299595 \cdot age} \times e^{0.1914525 \cdot priorsCount}\]

Some model coefficients listed above are obscure in their effect. For example, the coefficients of the logged values indicate that the prediction will go up or down according to every power of ten included in the original variable’s value. This is in contrast to the coefficient for the included linear term, age, which indicates that the prediction will change multiplicatively by the exponentiation of -0.0299595 likelihood to reoffend for every year of age the inmate has. These variables are all exponentiated to reflect the model utilizing the process of logistic regression.

This model, when demonstrated on the data it was trained on, has a prediction error of \(0.0941065\) meaning that approximately \(9.41\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(9.41\%\) of inmates will be predicted to reoffend when they do not or will be predicted not to reoffend when they do.

This is information which can be found in our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. The combination of these two numbers over the total number of data points gives us our error measurement. \(errorProportion = \frac{271 + 26}{3156}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.0941065.

This model, when demonstrated on testing data, has a prediction error of \(0.2835443\) meaning that approximately \(28.35\%\) of the predictions made with this model will either be false positives or false negatives. In this case that means \(28.35\%\) of inmates will be predicted to reoffend when they do not OR predicted not to reoffend when they do.

This is derived from our confusion matrix which was created upon the testing of this model. Our error rate for those inmates who did not reoffend can be shown as a proportion of the data which was predicted incorrectly as reoffending whereas the error rate for those inmates that did reoffend can be shown as the proportion of the data which was predicted incorrectly as not reoffending. \(errorPercentage = \frac{196 + 28}{790}\). Here, the first term in the numerator is the number of people in the data frame that did not reoffend but were predicted to reoffend while the second term in the numerator is the number of people who did reoffend but were predicted as not reoffending. The term in the denominator is the sample size contained within this specific dataframe. This ultimately results in the error value 0.2835443.

This model’s p-value regarding the testing data is \(0.0061037\). When this value is below 0.05, the model fitted to the data is statistically significant, as it has a less than \(5\%\) chance to occur randomly given random data, whereas if it is above 0.05 it is not statistically significant because it has a greater than \(5\%\) chance to occur randomly given random data. This p-value was created by a function located at the top of the markdown; it was calculated with a one-tail hypothesis test comparing two proportions. The null hypothesis states that \(32.53\%\) of inmates will reoffend, as this is the percentage of inmates in the data that do reoffend. The alternative hypothesis is that the recidivation rate is something other than this. The two proportions used for this hypothesis test are the error, which is the number of people who were incorrectly predicted over the number of total people, and the p-hat value, which is the total number of people who reoffended in the selected data set over the total number of people in that data set. The standard error used was from a standard calculation of standard error given population size and a single proportion. Because the p-value \(0.0061037\) is below 0.05, this hypothesis test tells us we can reject the null hypothesis, meaning the result is statistically significant and we should continue analyzing this data and other related data.

Comparison

These two models together make our most effective prediction model for recidivation in inmates. We noticed that males and females had the same predictors aside from the inclusion of daysInJail for males. It is curious that men’s recidivation rate goes down as they spend longer in jail (exponentiation of a value less than one which multiplies into the model). It is also interesting that men’s likelihood to reoffend goes up with age while women’s goes down. Overall these models are fairly similar though.

Task 4: Another Story in Data Science Regarding Criminal Justice: Predicting Jail Time!

Here we have a story chosen by us. We would like to know if data science can be used in conjunction with this criminal justice data to predict how long a person will stay in prison. A worthy inquiry if we do say so ourselves. We will be testing this through a multiple regression modeling process to see if there is an effective model hidden in the data used for the rest of this analysis.

Relevant Data Visualization

Figure 27

## Days in Jail vs Priors Count

# Fit a simple regression of log10(days in jail) on log10(priors count).
# Fixed variable-name typo: "proirsModel" -> "priorsModel" (renamed at both
# of its uses within this chunk).
priorsModel <- lm(logDaysInJail ~ logPriorsCount, jailTimeTraining)

# Figure 27: scatter of days in jail vs log10(priors count) with the fitted
# line and interval bands back-transformed to the original scale.
jjplotLogPoint(data = jailTimeTraining, x = logPriorsCount, y = daysInJail, color = recidCat, model = priorsModel) +
  theme_custom() +
  theme(legend.position = "none") +
  labs(
    title="Days in Jail vs log10(Priors Count)",
    x="log10(Priors Count)",
    y="Days in Jail"
  )

Figure 28

## Days in Jail vs Age

# Figure 28: days in jail vs age, with a simple linear fit of
# log10(days in jail) on age.
ageModel <- lm(logDaysInJail ~ age, data = jailTimeTraining)

jjplotLogPoint(
  data = jailTimeTraining,
  x = age,
  y = daysInJail,
  color = recidCat,
  model = ageModel
) +
  theme_custom() +
  theme(legend.position = "none")

Figure 29

## Days in Jail vs Juvenile Priors Count

# Figure 29: days in jail vs juvenile priors count.
juvModel <- lm(logDaysInJail ~ juvCount, jailTimeTraining)

# Fixed: the x aesthetic previously plotted priorsCount, which did not match
# the juvCount predictor used in juvModel or this figure's caption.
jjplotLogPoint(data = jailTimeTraining,x = juvCount,y=daysInJail,color=recidCat,model=juvModel) +
  theme_custom() +
  theme(legend.position = "none")

Figure 30

## Days in Jail vs Charge Description Word Count
# Figure 30: days in jail vs the word count of the charge description,
# with a simple linear fit on the log10 scale.
chargeDescModel <- lm(logDaysInJail ~ chargeDescCount, data = jailTimeTraining)

jjplotLogPoint(
  data = jailTimeTraining,
  x = chargeDescCount,
  y = daysInJail,
  color = recidCat,
  model = chargeDescModel
) +
  theme_custom() +
  theme(legend.position = "none")

Figure 31

## Days in Jail vs Charge Degree

# Figure 31: log10(days in jail) densities faceted by charge degree.
jjplotDensity(data = jailTimeTraining, x=logDaysInJail,fill=recidCat) +
  facet_wrap(~chargeDegree) +
  labs(
    title = "Charge Degree",
    # Fixed typo: "Day" -> "Days" (matches the label in Figure 32).
    x="Log10(Days In Jail)"
  )

Figure 32

## Days in Jail vs Sex

# Figure 32: log10(days in jail) densities faceted by sex.
jjplotDensity(data = jailTimeTraining, x = logDaysInJail, fill = recidCat) +
  facet_wrap(~sex) +
  labs(
    x = "Log10(Days In Jail)",
    title = "Sex"
  )

The Model

Overview

The following is our multiple regression model attempting to predict daysInJail by other factors in the dataset:

# Best Subset

# Candidate "everything" model: all plausible predictors of log10(days in
# jail); used only to drive the best-subset selection below.
jailTimeEverythingModel <- lm(logDaysInJail ~ sex + age + chargeDegree + priorsCount + juvCount + logPriorsCount + logJuvCount + chargeDescCount + logChargeDescCount + riskRecidDecileScore + riskViolenceDecileScore, data=jailTimeTraining)
# commented out for markdown
# ols_step_best_subset(jailTimeEverythingModel)

# Final model chosen from the olsrr best-subset search above.
bestJailTimeModel <-  lm(logDaysInJail ~ sex + chargeDegree + logPriorsCount, data=jailTimeTraining)

# Coefficients, stored for the inline equation in the text below.
coeffs = bestJailTimeModel$coefficients

# Back-transform predictions (10^) from the log10 scale to days.
jailTimeTestingPredict <- 10^predict.lm(bestJailTimeModel,newdata = jailTimeTesting)

jailTimeTrainingPredict <- 10^predict.lm(bestJailTimeModel,newdata = jailTimeTraining)

# Fit quality on the original (days) scale: testing correlation and RMSE,
# plus training RMSE for comparison.
corr <- cor(jailTimeTestingPredict,jailTimeTesting$daysInJail)

rmse <- RMSE(jailTimeTestingPredict,jailTimeTesting$daysInJail)

trainrmse <- RMSE(jailTimeTrainingPredict,jailTimeTraining$daysInJail)

\[daysInJail = 0.9855072 + 0.0979475 \cdot sex_{male} + -0.2570472 \cdot chargeDeg_{F2} +\] \[-0.3802251 \cdot chargeDeg_{F3} + 0.8587049 \cdot chargeDeg_{F5} + 0.3272061 \cdot chargeDeg_{F7} +\] \[-0.5431983 \cdot chargeDeg_{M1} + -0.4983351 \cdot chargeDeg_{M2} +\] \[ -0.4315611 \cdot chargeDeg_{M03} + -0.9069859 \cdot chargeDeg_{NI0} + 0.1944354 \cdot logPriorsCount\] The model above has some tricky terms in it due to most of the terms being variants of a single categorical variable. For all instances of sex and chargeDeg terms, the coefficient will only be added to the value of the model if the data point the model is being used on has data which matches the term’s subscript. The last term, logPriorsCount will add \(0.1944354\) to the total for every factor of \(10\) measured in priorsCount.

This model started with all variables and was refined using the olsrr library to find the best subset model. After that, we constructed our final model and made our predictions to measure RMSE and correlation coefficient with.

Diagnostic Plots

autoplot(bestJailTimeModel, 1:2) + theme_custom()

These diagnostic plots above show us that this model is absolutely atrocious. The residual plot shows a moderately clear trend through all of the data’s range and the Q-Q plot only meets the guideline in the center of the plot at zero and the immediately surrounding range. The plot quickly leaves the linear guide when moving left or right from (0,0). We can see clearly that this data should not be modeled this way or has predictors used in the proprietary algorithm which are not included in the data.

Correlation Coefficient \(= 0.178596\)
A correlation coefficient of \(0.178596\) (on the testing data) corresponds to a squared correlation of roughly \(0.0319\), meaning only about \(3.19\%\) of the variation in days in jail is explained by the model. The closer the correlation coefficient is to \(1\) or \(-1\), the better the model fits the data provided and, therefore, the more likely it is to be effective at predicting jailTime. A value of 0.178596 is quite poor. This model does not predict well for most of the testing data and should not be used to predict on new data.

In relation to our training data, this model has an RMSE of 64.0463482. This means that our predictions have a residual mean square error of 64.0463482 days. An average of 64 points of error on the training data is quite bad. Let’s move onto the testing RMSE.

In relation to our testing data, this model has an RMSE of 80.3885757. This means that our predictions have a residual mean square error of 80.3885757 days. An average of 80.4 points of error is quite bad. The testing and training RMSE values don’t differ greatly in context of the data range and is poor in quality which together indicate that there are predictors missing from this data which are useful in predicting this value. Sadly we did not find a better model for this score which indicates that this data likely does not contain the information to predict the length of a sentence.

Ethical Implications of Classification Models in the Criminal Justice System

Through the process of modeling statistics from prison inmate data including proprietary tests administered to inmates by a third party company, we learned that there are some rather eye-opening trends regarding the proprietary tests and race. As can be seen in Figure 10 mentioned earlier, there is an alarming difference in the densities of recidivation between the black population and other races. This indicates to us that the system used to predict whether or not an inmate will reoffend could be inherently racist. We, for this reason, chose to leave race out of all of the models in this analysis aside from the models which attempt to predict riskRecidDecileScore and riskRecidViolenceScore as we were attempting to emulate another model making process. It appeared as though the data necessary to predict these values was not included fully in the data we were given which means that, in conjunction with those missing variables, race could still be a major factor in the model and the process of classifying these inmates could take into account a factor, namely race, which would be unfair to classify by given that there are no concrete differences aside from the actual race of a person when it comes to humanity and ethicality. Classification models can be helpful and ethical in certain situations; however, obvious discriminatory factors should typically be excluded when attempting to predict a facet of a person which could impact their future.

Conclusion

Throughout the process of modeling across this data, we have concluded that splitting the data by gender produces the most effective model, with an error of \(0.2696203\) (\(26.96\%\)) for males and \(0.2835443\) (\(28.35\%\)) for females. The ethicality of race being included in predictors like this is ambiguous. Should this prediction affect a person’s future, then it is entirely unethical to predict by race, not to mention predicting by sex. However, if this prediction does not have a concrete impact on a person’s future then we believe that factors such as these can be used with discretion and heavy justification for the purpose of significantly higher accuracy. Overall we cannot be sure whether or not the algorithm used to predict the proprietary scores used these factors or if they used other, privileged information from the inmate surveys that we do not have access to. Ultimately, after all of this analysis, we have a best model to predict recidivation likelihood based on only a few factors and split by sex. Other than this we cannot conclude much without further analysis, more data, and future work.

References

ProPublica, Angwin, J., Larson, J., Mattu, S., & Kirchner, L. (2016, May 23). Machine Bias. ProPublica; ProPublica. https://www.propublica.org/article/machine-bias-risk-assessments-in-criminal-sentencing.